\documentclass[11pt]{article}
\usepackage{graphicx} % more modern
%\usepackage{times}
\usepackage{helvet}
\usepackage{courier}
\usepackage{epsf}
\usepackage{amsmath,amssymb,amsfonts,verbatim}
\usepackage{subfigure}
\usepackage{amsfonts}
\usepackage{amsmath}
\usepackage{latexsym}
\usepackage{algpseudocode}
\usepackage{algorithm}
\usepackage{enumerate}
%\usepackage{algorithmic}
\usepackage{multirow}
\usepackage{xcolor}

% Bold upright shorthands: \X expands to the letter X set with \bf
% (e.g. \A -> bold A, \x -> bold x); \0 and \1 give bold digits.
% NOTE(review): \a, \b, \c, \d, \H, \k, \L, \t, \u, \v coincide with standard
% LaTeX accent/letter commands; these \def's silently replace them, so text
% accents such as \c{c} or \v{s} are unavailable in this document.
\def\A{{\bf A}}
\def\a{{\bf a}}
\def\B{{\bf B}}
\def\b{{\bf b}}
\def\C{{\bf C}}
\def\c{{\bf c}}
\def\D{{\bf D}}
\def\d{{\bf d}}
\def\E{{\bf E}}
\def\e{{\bf e}}
\def\F{{\bf F}}
\def\f{{\bf f}}
\def\G{{\bf G}}
\def\g{{\bf g}}
\def\k{{\bf k}}
\def\K{{\bf K}}
\def\H{{\bf H}}
\def\I{{\bf I}}
\def\L{{\bf L}}
\def\M{{\bf M}}
\def\m{{\bf m}}
\def\n{{\bf n}}
\def\N{{\bf N}}
% NOTE(review): \BP is defined again later in this preamble as {\mathbb P};
% that later \def wins, so this bold-P definition is effectively dead.
\def\BP{{\bf P}}
\def\R{{\bf R}}
\def\BS{{\bf S}}
\def\s{{\bf s}}
\def\t{{\bf t}}
\def\T{{\bf T}}
\def\U{{\bf U}}
\def\u{{\bf u}}
\def\V{{\bf V}}
\def\v{{\bf v}}
\def\W{{\bf W}}
\def\w{{\bf w}}
\def\X{{\bf X}}
\def\Y{{\bf Y}}
\def\Q{{\bf Q}}
\def\x{{\bf x}}
\def\y{{\bf y}}
\def\Z{{\bf Z}}
\def\z{{\bf z}}
\def\0{{\bf 0}}
\def\1{{\bf 1}}


% Accented shorthands built on the math accents \hat and \tilde:
% \hx = hat over bold x; \tx, \ty, \tz = tilde over bold x, y, z;
% \hd = hat over (non-bold) d; \HD = hat over bold D.
\def\hx{\hat{\bf x}}
\def\tx{\tilde{\bf x}}
\def\ty{\tilde{\bf y}}
\def\tz{\tilde{\bf z}}
\def\hd{\hat{d}}
\def\HD{\hat{\bf D}}

% Calligraphic shorthands (\M<letter> -> \mathcal letter, \SW -> \mathcal{SW})
% followed by blackboard-bold shorthands (\BR, \BP -> \mathbb R, \mathbb P).
\def\MA{{\mathcal A}}
\def\MF{{\mathcal F}}
\def\MR{{\mathcal R}}
\def\MG{{\mathcal G}}
\def\MI{{\mathcal I}}
\def\MN{{\mathcal N}}
\def\MO{{\mathcal O}}
\def\MT{{\mathcal T}}
\def\MX{{\mathcal X}}
\def\SW{{\mathcal {SW}}}
\def\MW{{\mathcal W}}
\def\MY{{\mathcal Y}}
\def\BR{{\mathbb R}}
% NOTE(review): this replaces the earlier \def\BP{{\bf P}} from the bold-letter
% block above; after this line \BP means blackboard-bold P.
\def\BP{{\mathbb P}}

% Bold Greek beta and epsilon, typeset in an \mbox with \boldmath switched on.
% Unlike \bet, \epsi does not issue \unboldmath before the closing brace.
\def\bet{\mbox{\boldmath$\beta$\unboldmath}}
\def\epsi{\mbox{\boldmath$\epsilon$}}

% \etal: emphasised "et al." followed by a thin space.
% \tr, \rk, \diag, \dg, \vecd: upright (\mathrm) names "tr", "rk", "diag", "dg", "vec".
% \argmax, \argmin: roman "argmax"/"argmin" wrapped in \mathop so they are
% treated as large operators.
\def\etal{{\em et al.\/}\,}
\def\tr{\mathrm{tr}}
\def\rk{\mathrm{rk}}
\def\diag{\mathrm{diag}}
\def\dg{\mathrm{dg}}
\def\argmax{\mathop{\rm argmax}}
\def\argmin{\mathop{\rm argmin}}
\def\vecd{\mathrm{vec}}

% Bold Greek letters: each macro typesets one Greek symbol inside an \mbox
% with \boldmath active (lower-case name -> lower-case letter, capitalised
% name -> capital letter, e.g. \lam -> bold lambda, \Lam -> bold Lambda).
\def\ph{\mbox{\boldmath$\phi$\unboldmath}}
\def\vp{\mbox{\boldmath$\varphi$\unboldmath}}
\def\pii{\mbox{\boldmath$\pi$\unboldmath}}
\def\Ph{\mbox{\boldmath$\Phi$\unboldmath}}
\def\pss{\mbox{\boldmath$\psi$\unboldmath}}
\def\Ps{\mbox{\boldmath$\Psi$\unboldmath}}
\def\muu{\mbox{\boldmath$\mu$\unboldmath}}
\def\Si{\mbox{\boldmath$\Sigma$\unboldmath}}
\def\lam{\mbox{\boldmath$\lambda$\unboldmath}}
\def\Lam{\mbox{\boldmath$\Lambda$\unboldmath}}
\def\Gam{\mbox{\boldmath$\Gamma$\unboldmath}}
\def\Oma{\mbox{\boldmath$\Omega$\unboldmath}}
\def\De{\mbox{\boldmath$\Delta$\unboldmath}}
\def\de{\mbox{\boldmath$\delta$\unboldmath}}
\def\Tha{\mbox{\boldmath$\Theta$\unboldmath}}
\def\tha{\mbox{\boldmath$\theta$\unboldmath}}

% Theorem-like environments. Each one has its own counter (no shared-counter
% argument is given) and is numbered within the current section.
\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}{Lemma}[section]
\newtheorem{definition}{Definition}[section]
\newtheorem{proposition}{Proposition}[section]
\newtheorem{corollary}{Corollary}[section]
\newtheorem{example}{Example}[section]


% \probin: the \vDash symbol rotated by 90 degrees about its centre, boxed so
% it can be used in text or math.
% NOTE(review): presumably meant as an independence symbol -- confirm.
\def\probin{\mbox{\rotatebox[origin=c]{90}{$\vDash$}}}

% \calA: calligraphic A via the old \cal switch (same letter as \MA above).
\def\calA{{\cal A}}



%this is a comment

%use this as a template only... you may not need the subsections,
%or lists however they are placed in the document to show you how
%do it if needed.


%THINGS TO REMEMBER
%to compile a latex document - latex filename.tex
%to view the document        - xdvi filename.dvi
%to create a ps document     - dvips filename.dvi
%to create a pdf document    - dvipdf filename.dvi
%{\bf TEXT}                  - bold font TEXT
%{\it TEXT}                  - italic TEXT
%$ ... $                     - places ... in math mode on same line
%$$ ... $$                   - places ... in math mode on new line
%more info at www.cs.wm.edu/~mliskov/cs423_fall04/tex.html


% Page geometry: 6in x 8.5in text block, 0.25in extra side margin on both odd
% and even pages, and the top margin pulled up by 0.4in.
\setlength{\oddsidemargin}{.25in}
\setlength{\evensidemargin}{.25in}
\setlength{\textwidth}{6in}
\setlength{\topmargin}{-0.4in}
\setlength{\textheight}{8.5in}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \notes{#1}{#2}{#3}{#4}{#5}: typesets the framed header box of the handout.
%   #1  prefix used in page numbers (pages print as "#1 - <page>")
%   #2  text at the right of the top row (left side is the fixed course name)
%   #3  text at the left of the bottom row (italic)
%   #4  text at the right of the bottom row (italic)
%   #5  centred title in \Large in the middle row
% The three rows are fixed-width hboxes (5.78in) stacked in a \vbox inside a
% centred \framebox; 4mm of vertical space follows the box.
\newcommand{\notes}[5]{
	\renewcommand{\thepage}{#1 - \arabic{page}}
	\noindent
	\begin{center}
	\framebox{
		\vbox{
		\hbox to 5.78in { { \bf Statistical Machine Learning}
		\hfill #2}
		\vspace{4mm}
		\hbox to 5.78in { {\Large \hfill #5 \hfill} }
		\vspace{2mm}
		\hbox to 5.78in { {\it #3 \hfill #4} }
		}
	}
	\end{center}
	\vspace*{4mm}
}

% \ho: handout header for this lecture. It is declared with five arguments but
% only #1 (the lecture number) is used; #2--#5 are accepted and discarded, and
% the topic, professor and title strings passed to \notes are hard-coded here.
\newcommand{\ho}[5]{\notes{#1}{Distributions}{Professor: Zhihua Zhang}{}{Lecture Notes #1: Scale Mixture Distribution}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%begins a LaTeX document
% Start numbering as a continuation of earlier notes: section counter is 2 and
% subsection counter is 0, so the first \subsection below is 2.1. The theorem
% and definition counters start at 2 and the example counter at 4, so the next
% ones printed are numbers 3, 3 and 5 respectively. The lemma, proposition and
% corollary counters are left at their default of 0.
\setcounter{section}{2}
\setcounter{subsection}{0}
\setcounter{theorem}{2}
\setcounter{definition}{2}
\setcounter{example}{4}
\begin{document}
\ho{2}{2014.03.08}{Moses Liskov}{Name}{Lecture title}


\subsection{Distribution Function}
The CDF of a discrete random variable $X$ can be expressed as the sum of its probability mass function (pmf) $f_X(x)$ as follows: $$F(x) = \sum_{x_i\le x}f_X(x_i)$$
The CDF of a continuous random variable $X$ can be expressed as the integral of its probability density function (pdf) $f_X(x)$ as follows: $$F(x) = \int_{-\infty}^{x}f_X(t)\,\mathrm dt$$ and, wherever $F$ is differentiable, $$F'(x) = f_X(x)$$
\begin{lemma}
Let $F$ be the CDF for a random variable X, then we have
\begin{enumerate}[(1)]
\item
$\Pr(X=x)=F(x)-F(x^{-})$
\item
$\Pr(x<X\le y)=F(y)-F(x)$
\item
$\Pr(X>x)=1-F(x)$
\item
If $X$ is continuous, then $$F(b)-F(a)=\Pr(a<X<b)=\Pr(a\le X<b)=\Pr(a<X\le b)=\Pr(a\le X\le b)$$
\end{enumerate}
\end{lemma}
\begin{definition}
Suppose $X$ is a random variable with CDF $F(x)$. The inverse CDF is defined by: $$F^{-1}(q)=\inf\{x:F(x)>q\}$$ for $q \in [0,1]$. It is also called the \textbf{quantile function}.
\end{definition}
\begin{definition}
The \textbf{mode} of a discrete probability distribution is the value at which its pmf takes its maximum value. The mode of a continuous probability distribution is the value x at which its probability density function has its maximum value, so, informally speaking, the mode is at the peak.
\end{definition}
{\bf Remarks:}
\begin{enumerate}[(1)]
\item
The pmf is always less than or equal to 1, but the pdf can be greater than 1. For example, for the uniform distribution on $[0,1/5]$, the pdf is $f(x)=5$ on that interval. The pdf can even be unbounded, e.g., $f(x) = \frac{2}{3}x^{-\frac{1}{3}}$ for $0<x<1$, which tends to infinity as $x\rightarrow 0^{+}$.
\item
$\sum_x{f(x)}=1$ or $\int{f(x)}\,\mathrm dx=1$ is sometimes written as $\int{\mathrm dF(x)}=1$ or $\int{F(\mathrm dx)}=1$.
\item
We say that $X$ and $Y$ are equal in distribution iff $F_{X}(x)=F_{Y}(x)$ for any $x$. Notice that this is \textbf{not} the same as $X=Y$. For example, let $\Pr(X=1)=\Pr(X=-1)=\frac{1}{2}$ and let $Y=-X$; then $X$ and $Y$ are equal in distribution but $X\neq Y$.
\end{enumerate}
\subsection{Discrete Distribution Examples}
\subsubsection{Uniform Discrete Distribution}
Random variable $X \in \{x_1, x_2, ..., x_n\}$ has a uniform discrete distribution pmf $f$ if 
\[f(x) = \left\{\begin{array}{cc}
\frac{1}{n}  &  x = x_i, i = 1, 2, ..., n \\
0                &  \text{otherwise}
\end{array}
\right.\] 

\subsubsection{Point Mass Distribution}
Random variable $X$ has a point mass distribution pmf $f$ if 
\[f(x) = \left\{\begin{array}{cc}
1 & x= a \\
0 & \text{otherwise}
\end{array}
\right.
\]

\subsubsection{Bernoulli Distribution}
Random variable $X$ has a Bernoulli distribution pmf $f$ if 
\[f(x) = \left\{\begin{array}{cc}
p & x= 1 \\
1-p & x = 0
\end{array}
\right.
\]
where $p \in [0,1]$. It can also be written as $f(x)=p^x(1-p)^{1-x}$ for $x\in\{0,1\}$. In binary classification problems, the Bernoulli distribution is commonly used to model the category: given a predicted probability $y = f(x)$ of class 1, the sample is assigned to class 1 if $y>0.5$ and to class 2 otherwise.

\subsubsection{Poisson Distribution}
A discrete random variable $X$ is said to have a Poisson distribution with parameter $\lambda > 0$, 
if, for $x = 0, 1, 2, \dots$, the probability mass function of $X$ is given by:
\[f(x; \lambda) = \Pr(X=x)= e^{-\lambda} \frac{\lambda^x }{x!}.\]

It is easy to validate $\sum{f(x)}=1$ by the Taylor expansion of $e^\lambda$.

{\bf Remark}: If $X_1 \sim \mbox{Poisson}(\lambda_1)$ and $X_2 \sim \mbox{Poisson}(\lambda_2)$ are independent, 
then $X_1 + X_2 \sim \mbox{Poisson}(\lambda_1 + \lambda_2)$.

\subsubsection{Binomial Distribution}
A discrete random variable $X$ is said to have a binomial distribution with parameters $n$ and $p$, written $X \sim \mbox{Binomial}(n,p)$, if its probability mass function is given by:
\[f(k;n,p) = \Pr(X = k) = {n\choose k}p^k(1-p)^{n-k}\]
for $k=0,1,2,\dots,n$, where ${n \choose k} = \frac{n!}{k!(n-k)!}$ is the binomial coefficient. It can be interpreted as the probability of exactly $k$ successes in $n$ independent trials.

{\bf Remark}: If $X_1 \sim \mbox{Binomial}(n_1,p)$ and $X_2 \sim \mbox{Binomial}(n_2,p)$ are independent, 
then $X_1 + X_2 \sim \mbox{Binomial}(n_1+n_2,p)$.

By the way, we introduce something about gamma function and a generalization form of ${n\choose k}$.

The gamma function (represented by the capital Greek letter $\Gamma$) is an extension of the factorial function, with its argument shifted down by 1, to real and complex numbers. That is, if $n$ is a positive integer:
$$\Gamma(n)=(n-1)!$$

The gamma function is defined for all complex numbers except the negative integers and zero. For complex numbers with a positive real part, it is defined via a convergent improper integral:
$$\Gamma(x) = \int_0^\infty t^{x-1} e^{-t}\,{\rm d}t.$$

As a generalization of factorial function, $\Gamma(x+1)=x\Gamma(x)$, $\Gamma(1)=0!=1$ and $ \Gamma(\frac{1}{2})=\sqrt{\pi}$.

Also, we can define ${n\choose k}$ when $n$ is a real number and $k$ is an integer:
\[{n\choose k} = \left\{\begin{array}{cc}
\frac{n(n-1)\dots(n-k+1)}{k!} & k\ge 0 \\
0 & k<0
\end{array}
\right.
\]
Then we can get a new binomial theorem:$(1+z)^n = \sum_{k}{n \choose k}z^k$, $|z|<1$. It can be proved by Taylor expansion also.
\subsubsection{Negative Binomial Distribution}
Suppose there is a sequence of independent Bernoulli trials, each trial having two potential outcomes called ``success'' and ``failure''. In each trial the probability of success is $p$ and of failure is $1-p$. We observe this sequence until a predefined number $r$ of failures has occurred. Then the random number of successes we have seen, $X$, has the negative binomial (or Pascal) distribution:
\[X \sim \mbox{NB}(r, p).\]

The probability mass function of the negative binomial distribution is:
\[f(k; r,p) = \Pr(X = k) = {k+r-1 \choose k} p^k (1-p)^r\]
for $k = 0, 1, 2, ...$

Note that \begin{eqnarray*}
{k+r-1 \choose k}&=&\frac{(k+r-1)(k+r-2)\dots r}{k!} \\
&=&\frac{(-1)^{k}(-r)(-r-1)\dots(-r-k+1)}{k!}\\
&=&(-1)^k{-r \choose k}\\
\end{eqnarray*}
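That is why it is called the negative binomial distribution. Hence, by the generalized binomial theorem above (with $z=-p$, $|p|<1$), $$\sum_{k=0}^{\infty}{\Pr(X=k)}=(1-p)^r\sum_{k=0}^{\infty}{(-1)^k{-r \choose k}p^k}=(1-p)^r(1-p)^{-r}=1$$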

When $r=1$, the negative binomial distribution is the \textbf{geometric distribution}: $\Pr(X = k) = p^{k}(1-p)$ for $k = 0, 1, 2, \dots$, i.e., the number of successes before the first failure.

Let $p=\frac{\lambda}{\lambda+r}$ with $\lambda>0$ fixed. If $r \rightarrow \infty$, then $p \rightarrow 0$, and for each fixed $k$ the pmf converges to that of the Poisson distribution:
\begin{eqnarray*}
\lim_{r\rightarrow\infty}f(k;r,p) &=& \lim_{r\rightarrow\infty}\frac{(k+r-1)\dots r}{k!}\left(\frac{\lambda}{r+\lambda}\right)^k\left(\frac{r}{r+\lambda}\right)^r \\
&=&\lim_{r\rightarrow\infty}\lambda^k\frac{(k+r-1)\dots r}{k!}\left(\frac{1}{r+\lambda}\right)^k\left(\frac{1}{\frac{\lambda}{r}+1}\right)^r \\
&=&\lim_{r\rightarrow\infty}\frac{\lambda^k}{k!}\frac{(k+r-1)\dots r}{(\lambda+r)^k}\frac{1}{\left(1+\frac{\lambda}{r}\right)^r} \\
&=&\frac{\lambda^k}{k!}e^{-\lambda} \\
\end{eqnarray*} 

\textbf{Bernoulli Distribution and Measure}
Let $\Omega=[0, 1]$, $P([a,b])=b-a$, $0\le a\le b\le 1$ (Lebesgue measure).
Fix $p \in (0,1)$ and let \[X(\omega) = \left\{\begin{array}{cc}
1 & \omega\le p \\
0 & \omega > p \\
\end{array}
\right.
\]
Hence, $\Pr(X=1)=\Pr(\omega\le p)=\Pr([0,p])=p$ and $\Pr(X=0)=\Pr(\omega> p)=\Pr((p,1])=1-p$, i.e., $X$ has a Bernoulli distribution with parameter $p$.

\textbf{Homework}:
\begin{enumerate}[(1)]
\item
If $\lim \limits_{n\rightarrow\infty}a_n=a$, show that $\lim \limits_{n\rightarrow\infty}\left(1+\frac{a_n}{n}\right)^n=e^{a}$.
\item
Prove the Stirling Formula.
\begin{eqnarray*}
\lim_{p\rightarrow\infty}\frac{\ln\Gamma(p)}{\frac{1}{2}\ln(2\pi)+(p-\frac{1}{2})\ln{p}-p}&=&1\\
\lim_{p\rightarrow\infty}\frac{\Gamma(p)}{(2\pi)^{\frac{1}{2}}p^{p-\frac{1}{2}}e^{-p}}&=&1
\end{eqnarray*}
\end{enumerate}
\subsection{Continuous Distribution Examples}
\subsubsection{Continuous Uniform Distribution}
A continuous random variable $X$ is said to have a uniform distribution in $[a,b]$, 
if the probability density function is given by:
\[ f(x)=\begin{cases}
  \frac{1}{b - a} &  a \leq x \leq b \\
  0 & \mbox{otherwise}
  \end{cases} \]

\subsubsection{Normal(Gaussian) Distribution}
A continuous random variable $X$ is said to have a Gaussian distribution with parameter $\mu$ and $\sigma$, if the probability density function of $X$ is given by:
\[f(x; \mu, \sigma) = \frac{1}{\sqrt{2\pi}\sigma} e^{-\frac{(x-\mu)^2}{2\sigma^2}}\]
denoted as $X\sim \MN(\mu, \sigma^2)$. The cumulative distribution function of Gaussian random variable $X$ with parameter $\mu = 0$ and $\sigma = 1$ ($X\sim \MN(0,1)$)is:
\[\Phi(z) = \Pr(X < z) = \int_{-\infty}^z \frac{1}{\sqrt{2\pi}} e^{-\frac{t^2}{2}} \mathrm dt\]

\subsubsection{Dirac Distribution}
The Dirac function, or $\delta$ function can be loosely thought of as a function on the real line which is zero everywhere except at the origin, where it is infinite,
$$\delta(x) = \begin{cases} +\infty, & x = 0 \\ 0, & x \ne 0 \end{cases}$$
and which is also constrained to satisfy the identity
$$\int_{-\infty}^\infty \delta(x) \, \mathrm dx = 1$$
\subsubsection{Exponential Power Distribution}
A random variable $X$ is said to have an exponential power distribution with parameter $\mu$, $\sigma$, $q$ if its probability density function is :
$$
f(x)=\frac{1}{2^{\frac{q+1}{q}}\Gamma(\frac{q+1}{q})\sigma} \; e^{\left(-\frac{1}{2}|\frac{x-\mu}{\sigma}|^q\right)}
$$
where $\mu \in \BR$, $\sigma > 0$, $q > 0$.

This family includes the normal distribution when $q=2$ and the Laplace distribution when $q=1$: $f(x)=\frac{1}{4\sigma} e^{-\frac{|x-\mu|}{2\sigma}}$.

To validate $\int{f(x)}=1$, the following formulas may help. For $a>0, p>0$,
\begin{eqnarray*}
\int_{0}^{\infty}x^{p-1}e^{-ax}\mathrm dx &=& a^{-p}\Gamma(p) \\
\int_{0}^{\infty}x^{-(p+1)}e^{-ax^{-1}}\mathrm dx &=& a^{-p}\Gamma(p) \\
\int_{0}^{\infty}x^{p-1}e^{-ax^2}\mathrm dx &=& \frac{1}{2}a^{-\frac{p}{2}}\Gamma(\frac{p}{2}) \\
\int_{0}^{\infty}x^{-(p+1)}e^{-ax^{-2}}\mathrm dx &=& \frac{1}{2}a^{-\frac{p}{2}}\Gamma(\frac{p}{2}) \\
\end{eqnarray*}
More generally, for $a>0$, $p>0$, $q>0$,
\begin{eqnarray*}
\int_{0}^{\infty}x^{p-1}e^{-ax^q}\mathrm dx &=& \frac{1}{q}a^{-\frac{p}{q}}\Gamma(\frac{p}{q}) \\
\int_{0}^{\infty}x^{-(p+1)}e^{-ax^{-q}}\mathrm dx &=& \frac{1}{q}a^{-\frac{p}{q}}\Gamma(\frac{p}{q}) \\
\end{eqnarray*}
\subsubsection{Generalized Inverse Gaussian Distribution}
A continuous random variable $X$ is said to have generalized inverse Gaussian distribution(GIG) with parameters $\alpha$, $\beta$, $r$, if the probability density function of $X$ is given by:
\[f(x) = \frac{(\alpha / \beta)^{r/2}}{2 K_r(\sqrt{\alpha\beta})} x^{r-1} e^{-(\alpha x + \beta/x)/2}, x > 0\]
where $K_r$ is a modified Bessel function of second kind with index $r$, $\alpha > 0$, $\beta > 0$.

\textbf{Properties of Bessel Function}
\begin{enumerate}[(1)]
\item
$K_r(u)=K_{-r}(u)$
\item
$K_{r+1}(u)=2\frac{r}{u}K_{r}(u)+K_{r-1}(u)$
\item
$K_{1/2}(u)=K_{-1/2}(u)=\sqrt{\frac{\pi}{2u}}e^{-u}$
\item
$u\rightarrow0 \,\, \begin{cases}
  K_r(u) \sim \frac{1}{2}\Gamma(r)\left(\frac{u}{2}\right)^{-r}, & r>0 \\
  K_0(u) \sim -\ln{u}
  \end{cases}$
\item
$u\rightarrow\infty, K_r(u)\sim\sqrt{\frac{\pi}{2u}}e^{-u}$
\end{enumerate}

\textbf{Gamma Distribution}

In particular, in the limiting case $\beta=0$ with $\alpha>0, r>0$, we have $X\sim \mbox{Gamma}(r,\frac{\alpha}{2})$,
$$
f(x) = \frac{\alpha^r}{2^r\Gamma(r)} x^{r-1} e^{- \frac{\alpha x}{2} }
$$
When $r=1$, it is the exponential distribution. If $X_i\sim  \mbox{Gamma}(r_i,\alpha)$ are independent, then $\sum_{i=1}^{n}X_i\sim \mbox{Gamma}({\sum_{i=1}^n{r_i},\alpha})$.

\textbf{Inverse Gamma Distribution}

In particular, in the limiting case $\alpha=0$ with $r<0, \beta>0$, we have $X\sim \mbox{Inv-Gamma}(\tau,\frac{\beta}{2})$ with $\tau=-r>0$,
$$
f(x) = \frac{\beta^\tau}{2^\tau\Gamma(\tau)} x^{-(\tau+1)} e^{- \frac{\beta}{2x} },\tau=-r
$$

\textbf{Inverse Gaussian}

In particular, when $r=-\frac{1}{2}$, using $K_{-1/2}(u)=\sqrt{\frac{\pi}{2u}}e^{-u}$ with $u=\sqrt{\alpha\beta}$,
\begin{eqnarray*}
f(x) &=& \left(\frac{\beta}{2\pi}\right)^{\frac{1}{2}}\exp\left(\sqrt{\alpha\beta}\right)x^{-\frac{3}{2}}\exp\left(-\frac{\alpha x+x^{-1}\beta}{2}\right)\\
&=& \left[\frac{\lambda}{2 \pi x^3}\right]^{1/2} \exp\left(\frac{-\lambda (x-\mu)^2}{2 \mu^2 x}\right)
\end{eqnarray*}
where $\alpha = \lambda/\mu^2, \beta = \lambda$.

\subsubsection{Chi-Squared Distribution}
A continuous random variable $X$ is said to have a chi-squared distribution with $p$ degrees of freedom, if the probability density function of $X$ is given by:
\[f(x) = \frac{1}{\Gamma(\frac{p}{2})2^{\frac{p}{2}}} x^{\frac{p}{2} - 1} e^{-\frac{x}{2}}, x>0\]
Note that if $Z_1,\dots,Z_k$ are independent $\MN(0,1)$ random variables, then $\sum_{i=1}^{k}Z_i^2 \sim \chi^2_k$ (the squared norm of $k$ independent standard normal variables has a chi-squared distribution with $k$ degrees of freedom).

\subsubsection{Beta Distribution}
A continuous random variable $X$ is said to have beta distribution, if the probability density function of $X$ is given by:
\[f(x) = \frac{\Gamma(\alpha + \beta)}{\Gamma(\alpha)\Gamma(\beta)} x^{\alpha - 1} (1-x)^{\beta - 1}, \quad 0 < x < 1,\]
where $\alpha > 0$ and $\beta > 0$.
The beta function is defined as:
$$
\mbox{Beta}(p,q)=\frac{\Gamma(p)\Gamma(q)}{\Gamma(p+q)}
$$
When $\alpha=1,\beta=1$, it is uniform distribution on $[0,1]$.

\subsubsection{Student's t-distribution}
A continuous random variable $X$ is said to have Student's t-distribution with $\nu$ degrees of freedom, location $\mu$ and scale $\sigma$ ($X\sim t_\nu$), if the probability density function of $X$ is given by:
$$
f(x) = \frac{\Gamma(\frac{\nu+1}{2})}{\Gamma(\frac{\nu}{2})} \frac{1}{\left(1+\frac{(x-\mu)^2}{\nu\sigma^2}\right)^{\frac{\nu+1}{2}}} \frac{1}{\sqrt{\nu\pi}\,\sigma}
$$
When $\nu=1$, it is the Cauchy distribution, and when $\nu\rightarrow\infty$, $t_\nu\rightarrow\MN(\mu,\sigma^2)$.

It can be shown that the t-distribution is like an infinite sum of Gaussians, where each Gaussian has a different variance:
$$
\int_{0}^{\infty}\MN(x\mid\mu,(\lambda\tau)^{-1})\,\mbox{Gamma}(\tau\mid\frac{\nu}{2},\frac{\nu}{2})\,\mathrm d\tau=t_\nu(x\mid\mu,\lambda^{-1})
$$
This means that the t-distribution is a scale mixture of normal distributions. It results from compounding a Gaussian distribution with mean $\mu$ and unknown precision (the reciprocal of the variance), with a gamma distribution placed over the precision with parameters $r = \nu/2$ and $\alpha/2 = \nu/2$. In other words, the random variable $X$ is assumed to have a normal distribution with an unknown precision distributed as gamma, and then this is marginalized over the gamma distribution.

\begin{example}
Suppose $X\sim \mbox{Bernoulli}(\theta)$, $\theta\sim \mbox{Beta}(\alpha,\beta)$.
\begin{eqnarray*}
p(\theta\mid x)&\propto& p(x\mid\theta)p(\theta\mid\alpha,\beta)\\
&\propto& \theta^x(1-\theta)^{1-x}\theta^{\alpha-1}(1-\theta)^{\beta-1} \\
&\propto& \theta^{x+\alpha-1}(1-\theta)^{\beta-x} \\
&\sim& \mbox{Beta}(x+\alpha, \beta-x+1) \\
\end{eqnarray*}
We say that beta distribution is the conjugate prior for the Bernoulli distribution. Generally, if the posterior distributions $p(\theta\mid x)$ are in the same family as the prior probability distribution $p(\theta)$, the prior and posterior are then called conjugate distributions, and the prior is called a conjugate prior for the likelihood function.
\end{example}

\begin{example}
Suppose $X\sim \MN(0,\lambda)$.
$$
f(x) = \frac{1}{\sqrt{2\pi}}\lambda^{-\frac{1}{2}}\exp(-\frac{x^2}{2\lambda})
$$

If $\lambda\sim \mbox{Gamma}(r,\alpha/2)$,
\begin{eqnarray*}
p(\lambda\mid x)&\propto& p(x\mid\lambda)p(\lambda\mid r,\alpha/2)\\
&\propto& \lambda^{-\frac{1}{2}}\exp({-\frac{x^2}{2\lambda}})\lambda^{r-1}\exp(-\frac{\alpha\lambda}{2}) \\
&\propto& \lambda^{r-3/2}\exp(-\frac{1}{2}(\frac{x^2}{\lambda}+\alpha\lambda)) \\
\end{eqnarray*}
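This is a generalized inverse Gaussian distribution with index $r-\frac{1}{2}$ and parameters $\alpha$ and $\beta=x^2$. Since the posterior is not a gamma distribution, the gamma prior is not a conjugate prior here.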

If $\lambda\sim \mbox{Inv-Gamma}(\tau,\beta/2)$,
\begin{eqnarray*}
p(\lambda\mid x)&\propto& p(x\mid\lambda)p(\lambda\mid \tau,\beta/2)\\
&\propto& \lambda^{-\frac{1}{2}}\exp({-\frac{x^2}{2\lambda}})\lambda^{-(\tau+1)}\exp(-\frac{\beta}{2\lambda}) \\
&\propto& \lambda^{-(\tau+1)-1/2}\exp(-\frac{1}{2}(\frac{x^2}{\lambda}+\frac{\beta}{\lambda})) \\
&\sim&\mbox{Inv-Gamma}\left(\tau+1/2,(\beta+x^2)/2\right) \\
\end{eqnarray*}
Hence, Inv-Gamma is a conjugate prior for the variance of a Gaussian distribution with known mean.
\end{example}
\end{document}




